import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

By conducting these analyses, we can gain insights into:
- The leading institutions in a particular research area
- The evolving trends in research
- The areas of collaboration and expertise
- The key themes and topics being explored
Loading Dataset
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_colwidth', 100)# full dataset
df = pd.read_csv('data/bcas_dataset_fin.csv')df.shape(7216, 18)
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7216 entries, 0 to 7215
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 url 7216 non-null object
1 date 7216 non-null int64
2 views 7216 non-null int64
3 downloads 7216 non-null int64
4 author_cn 7209 non-null object
5 author_en 7216 non-null object
6 title_cn 7216 non-null object
7 title_en 3384 non-null object
8 org_cn 4104 non-null object
9 org_en 2150 non-null object
10 abstract_cn 4750 non-null object
11 abstract_en 2233 non-null object
12 keywords_cn 5080 non-null object
13 keywords_en 2145 non-null object
14 fund_project 1054 non-null object
15 similar 7128 non-null object
16 issue 7216 non-null object
17 page 7216 non-null object
dtypes: int64(3), object(15)
memory usage: 1014.9+ KB
df = df.rename(columns={'date': 'year'})
# filter out 2024 as it's not finished yet
df = df[df['year'] < 2024]df.shape(7048, 18)
Publication Trends
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "ggplot2"
# layout settings
def set_layout():
    """Return the shared plotly layout options applied to every figure below.

    Both axes get a visible black axis line and an empty title; callers
    override individual keys via ``fig.update_layout(layout, ...)``.
    """
    axis_line = {'showline': True, 'linecolor': 'black'}
    return {
        'width': 800,
        'height': 400,
        'xaxis': {'title': '', 'tickmode': 'array', **axis_line},
        'yaxis': {'title': '', **axis_line},
        'font': {'size': 14, 'family': "Verdana"},
        'margin': {'r': 0, 'l': 0, 't': 50, 'b': 50},
    }
layout = set_layout()gp = df.groupby('year')['title_cn'].count().reset_index()
fig = px.line(
gp,
x='year',
y='title_cn',
)
fig.update_layout(
layout,
title='Number of Publications by Year, 1986-2023',
xaxis=dict(
type='category',
title="",
minor=dict(ticks="inside", showgrid=True),
),
yaxis=dict(title="Publications")
)
fig.update_traces(
line_color='#0E86D4',
line=dict(width=3),
hovertemplate='Year: %{x}<br>Articles: %{y}<extra></extra>'
)
fig.show()Views & Downloads
fig = px.histogram(df, x="views", nbins=400)
fig.update_layout(
layout,
title='Distribution of Article Views',
yaxis=dict(
title="Count"
),
xaxis=dict(
title="Views",
tickangle=0
),
)
fig.update_traces(
marker_color='#0E86D4',
hovertemplate='Views: %{x}<br>Count: %{y}<extra></extra>'
)
fig.show()fig = px.scatter(
df,
x="views",
y="downloads",
log_x=False
)
fig.update_layout(
layout,
title='Relationship Between Article Views and Downloads',
xaxis_title="Views",
yaxis_title="Downloads"
)
fig.update_traces(
marker_color='#0E86D4',
hovertemplate='Views: %{x}<br>Downloads: %{y}<extra></extra>'
)
fig.show()gp = df.groupby('year')[['views', 'downloads']].sum().reset_index()
fig = go.Figure()
fig.add_trace(go.Scatter(
x=gp['year'],
y=gp['views'],
name='Views',
line=dict(color='#0E86D4', width=3),
hovertemplate='Year: %{x}<br>Views: %{y}<extra></extra>'
))
fig.add_trace(go.Scatter(
x=gp['year'],
y=gp['downloads'],
name='Downloads',
line=dict(color='#FF3131', width=3),
hovertemplate='Year: %{x}<br>Downloads: %{y}<extra></extra>'
))
fig.update_layout(
layout,
width=900,
title='Views and Downloads Trend, 1986-2023',
xaxis=dict(
type='category',
tickmode='array',
title="",
minor=dict(ticks="inside", showgrid=True),
),
yaxis=dict(title="Count"),
)
fig.show()# calculate avg downloads and views per article for each year
gp = df.groupby('year')[['views', 'downloads']].sum().reset_index()
gp['articles'] = df.groupby('year').size().values
gp['views_avg'] = gp.views/gp.articles
gp['downloads_avg'] = gp.downloads/gp.articles
fig = go.Figure()
fig.add_trace(go.Scatter(
x=gp['year'],
y=gp['views_avg'],
name='Views per Article',
line=dict(color='#0E86D4', width=3),
hovertemplate='Year: %{x}<br>Views Avg: %{y}<extra></extra>'
))
fig.add_trace(go.Scatter(
x=gp['year'],
y=gp['downloads_avg'],
name='Downloads per Article',
line=dict(color='#FF3131', width=3),
hovertemplate='Year: %{x}<br>Downloads Avg: %{y}<extra></extra>'
))
fig.update_layout(
layout,
width=950,
title='Views and Downloads per Article Trend, 1986-2023',
xaxis=dict(
type='category',
title="",
minor=dict(ticks="inside", showgrid=True)
),
yaxis=dict(
title="Count",
)
)
fig.show()Organizations
General Statistics
orgs = pd.read_csv('data/orgs_flat.csv')
orgs = orgs[orgs['year'] < 2024]gp = orgs.groupby('year')['orgs_head'].nunique().reset_index()
fig = px.line(gp, x='year', y='orgs_head', title='')
fig.update_layout(
layout,
title='Organization Count by Year, 1986-2023',
xaxis=dict(
type='category',
minor=dict(ticks="inside", showgrid=True)
),
yaxis=dict(
title="Organizations"
)
)
fig.update_traces(
line_color='#0E86D4',
line=dict(width=3),
hovertemplate='Year: %{x}<br>Organizations: %{y}<extra></extra>'
)
fig.show()article_total_year = df.groupby('year')['title_cn'].count().reset_index()
orgs_per_year = orgs.groupby('year')['orgs_head'].nunique().reset_index()
orgs_per_year['orgs_per_article'] = orgs_per_year['orgs_head'] / \
article_total_year['title_cn']
fig = px.line(orgs_per_year, x='year', y='orgs_per_article', title='')
fig.update_layout(
layout,
xaxis=dict(
type='category',
minor=dict(ticks="inside", showgrid=True)
),
yaxis=dict(
title="Organizations per Article"
)
)
fig.update_traces(
line_color='#0E86D4',
line=dict(width=3),
hovertemplate='Year: %{x}<br>Organizations per Article: %{y:.2f}<extra></extra>'
)
fig.show()# filter out entries outside of 2013-2023 period
orgs = orgs[(orgs['year'] > 2012) & (orgs['year'] < 2024)]gp = orgs.orgs_head.value_counts().sort_values(
ascending=False).head(10).reset_index()
# Create the bar chart
fig = px.bar(
gp,
x='count',
y='orgs_head',
orientation='h'
)
fig.update_layout(
layout,
width=1200,
height=400,
xaxis=dict(
title="Frequency",
range=[0, 500]
),
yaxis=dict(
autorange="reversed"
),
title='Top 10 Organizations by Frequency, 2013-2023',
)
fig.update_traces(
textposition='outside',
texttemplate='%{x}',
marker_color='#0E86D4',
textfont=dict(color='black'),
cliponaxis=True,
opacity=0.7,
hovertemplate='Organization: %{y}<br>Count: %{x}<extra></extra>'
)
fig.show()Geospatial Analysis
orgs = orgs[(orgs['year'] > 2012) & (orgs['year'] < 2024)]
orgs.head()| url | org_cn | city_cn | city_en | org_cn_head | orgs_head | title_cn | year | |
|---|---|---|---|---|---|---|---|---|
| 314 | http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1 | 中国石油勘探开发研究院 | 北京 | Beijing | 中国石油勘探开发研究院 | Research Institute of Petroleum Exploration and Development | 油气安全战略与“双碳”战略:关系与路径 | 2023 |
| 315 | http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1 | 国家油气战略研究中心 | 北京 | Beijing | 国家油气战略研究中心 | National Oil and Gas Strategic Research Center | 油气安全战略与“双碳”战略:关系与路径 | 2023 |
| 316 | http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230102&flag=1 | 深部煤矿采动响应与灾害防控国家重点实验室 | 淮南 | Huainan | 深部煤矿采动响应与灾害防控国家重点实验室 | State Key Laboratory of Mining Response and Disaster Prevention In Deep Coal Mines | 我国煤炭主体能源安全高质量发展的理论技术思考 | 2023 |
| 317 | http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230103&flag=1 | 中国煤炭科工集团有限公司 | 北京 | Beijing | 中国煤炭科工集团有限公司 | China Coal Technology Engineering Group (Ccteg) | 新形势下我国能源高质量发展与能源安全 | 2023 |
| 318 | http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230103&flag=1 | 煤炭科学研究总院 | 北京 | Beijing | 煤炭科学研究总院 | China Coal Research Institute | 新形势下我国能源高质量发展与能源安全 | 2023 |
from urllib.request import urlopen
import json
with urlopen('https://unpkg.com/cn-atlas@0.1.2/prefectures.json') as response:
cities = json.load(response)import plotly.express as px
gp = orgs.groupby(['year', 'city_en'])['orgs_head'].nunique().reset_index()
fig = px.choropleth_mapbox(
data_frame=gp,
geojson=cities,
color='orgs_head',
locations="city_en",
featureidkey="properties.name",
mapbox_style="carto-positron",
color_continuous_scale='dense',
center={"lat": 37.110573, "lon": 106.493924},
zoom=3,
)
fig.update_layout(
layout,
title='Organizations by City, 2013-2023',
width=800,
height=500,
coloraxis_colorbar=dict(title='Count'),
margin=dict(r=10, l=10, t=50, b=10)
)
fig.update_traces(
marker_line_width=1,
marker_line_color='black',
hovertemplate='City: %{location}<br>Organizations: %{z}<extra></extra>'
)
fig.show()import plotly.express as px
gp = orgs['city_en'].value_counts().reset_index()
fig = px.choropleth_mapbox(
data_frame=gp,
geojson=cities,
color='count',
locations="city_en",
featureidkey="properties.name",
mapbox_style="carto-positron",
color_continuous_scale='dense',
center={"lat": 37.110573, "lon": 106.493924},
zoom=3,
)
fig.update_layout(
layout,
title='Total Number of Affiliations by City, 2013-2023',
width=800,
height=500,
coloraxis_colorbar=dict(title='Count'),
margin=dict(r=10, l=10, t=50, b=10)
)
fig.update_traces(
marker_line_width=1,
marker_line_color='black',
hovertemplate='City: %{location}<br>Organizations: %{z}<extra></extra>'
)
fig.show()Collaboration Network
# keep only the rows where the number of unique values in the 'orgs_head' column, grouped by the 'title_cn' column, is greater than 1
orgs_filtered = orgs[orgs.groupby(
'title_cn')['orgs_head'].transform('nunique') > 1]
orgs_filtered = orgs_filtered[['year', 'orgs_head', 'title_cn']]
gp = orgs_filtered.groupby(
['year', 'title_cn']).orgs_head.unique().reset_index()from itertools import combinations
# initialize list to hold collaboration pairs
collaboration_pairs = []
# iterate through the grouped data
# NOTE: the loop variable must NOT be named `orgs` — the original code
# clobbered the `orgs` DataFrame defined earlier in the notebook here.
for idx, row in gp.iterrows():
    org_list = row['orgs_head']
    title = row['title_cn']
    year = row['year']
    # generate all possible unique pairs of organizations
    if len(org_list) > 1:
        for pair in combinations(org_list, 2):
            collaboration_pairs.append((year, title, pair[0], pair[1]))
# create a df from the pairs: one row per (year, article, org pair)
collaboration_df = pd.DataFrame(collaboration_pairs, columns=[
    'year', 'title_cn', 'org1', 'org2'])
collaboration_df.head()
|---|---|---|---|---|
| 0 | 2013 | 中国地理信息系统的发展与展望 | Institute of Geographic Sciences and Natural Resources Research, CAS | Institute of Remote Sensing and Digital Earth, CAS |
| 1 | 2013 | 中国地理信息系统的发展与展望 | Institute of Geographic Sciences and Natural Resources Research, CAS | National University of Defense Technology |
| 2 | 2013 | 中国地理信息系统的发展与展望 | Institute of Geographic Sciences and Natural Resources Research, CAS | Wuhan University |
| 3 | 2013 | 中国地理信息系统的发展与展望 | Institute of Remote Sensing and Digital Earth, CAS | National University of Defense Technology |
| 4 | 2013 | 中国地理信息系统的发展与展望 | Institute of Remote Sensing and Digital Earth, CAS | Wuhan University |
# calculate collaboration strength (=number of occurrences for a pair)
# group by year and organization pair, then count collaborations
collaboration_strength = collaboration_df.groupby(
['year', 'org1', 'org2']).size().reset_index(name='strength')
# ensure each pair appears only once per year ((A, B) = (B, A))
collaboration_strength['org_pair'] = collaboration_strength.apply(
lambda row: tuple(sorted([row['org1'], row['org2']])), axis=1)
collaboration_strength = collaboration_strength.groupby(
['year', 'org_pair'])['strength'].sum().reset_index()
# split the org_pair back into separate columns
collaboration_strength[['org1', 'org2']] = pd.DataFrame(
collaboration_strength['org_pair'].tolist(), index=collaboration_strength.index)
# drop the org_pair column
collaboration_strength = collaboration_strength.drop('org_pair', axis=1)
# sort
collaboration_strength = collaboration_strength.sort_values(
['year', 'strength'], ascending=[True, False])
# reset the index
collaboration_strength = collaboration_strength.reset_index(drop=True)collaboration_strength.sort_values(by='strength', ascending=False).head(10)| year | strength | org1 | org2 | |
|---|---|---|---|---|
| 2009 | 2022 | 39 | Institutes of Science and Development, CAS | University of CAS |
| 1354 | 2020 | 29 | Institutes of Science and Development, CAS | University of CAS |
| 2350 | 2023 | 22 | Institutes of Science and Development, CAS | University of CAS |
| 1355 | 2020 | 17 | Institute of Geographic Sciences and Natural Resources Research, CAS | University of CAS |
| 99 | 2016 | 16 | Institute of Geographic Sciences and Natural Resources Research, CAS | University of CAS |
| 1636 | 2021 | 15 | Institutes of Science and Development, CAS | University of CAS |
| 1111 | 2019 | 14 | Institutes of Science and Development, CAS | University of CAS |
| 100 | 2016 | 13 | CAS | Institute of Geographic Sciences and Natural Resources Research, CAS |
| 1637 | 2021 | 13 | CAS | University of CAS |
| 1356 | 2020 | 10 | CAS | University of CAS |
import networkx as nx
G = nx.Graph()
# Add edges with weights
for _, row in collaboration_strength.iterrows():
G.add_edge(row['org1'], row['org2'], weight=row['strength'])
# Calculate total strength for each organization
org_strength = {}
for org in G.nodes():
org_strength[org] = sum(G[org][neighbor]['weight'] for neighbor in G[org])
# Set position layout
pos = nx.spring_layout(G, k=0.5, iterations=50)
# Create edge trace
edge_x = []
edge_y = []
edge_widths = []
for edge in G.edges(data=True):
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x.extend([x0, x1, None])
edge_y.extend([y0, y1, None])
# Calculate edge width based on the strength of collaboration
edge_width = edge[2]['weight'] * 0.3 # Adjust this multiplier as needed
edge_widths.append(edge_width)
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
line=dict(color='#0E86D4', width=1),
hoverinfo='none',
mode='lines')
# Create node trace
node_x = []
node_y = []
for node in G.nodes():
x, y = pos[node]
node_x.append(x)
node_y.append(y)
node_trace = go.Scatter(
x=node_x, y=node_y,
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='Blues',
reversescale=False,
color=[],
size=10,
colorbar=dict(
thickness=15,
title='Connections',
xanchor='left',
titleside='right',
title_font_family="Verdana",
tickfont_family="Verdana"
),
line_width=2))
# Color node points by the number of connections
node_adjacencies = []
node_text = []
for node, adjacencies in G.adjacency():
num_connections = len(adjacencies)
node_adjacencies.append(num_connections)
node_text.append(
f'<span style="font-family: Verdana;">{node}<br>Connections: {num_connections}</span>')
node_trace.marker.color = node_adjacencies
node_trace.text = node_text
network_density = nx.density(G)
fig = go.Figure()
# Add edge traces with varying widths
for i in range(len(edge_x) // 3):
fig.add_trace(go.Scatter(
x=edge_x[i*3:(i+1)*3], y=edge_y[i*3:(i+1)*3],
line=dict(width=edge_widths[i], color='#0E86D4'),
hoverinfo='none',
mode='lines'
))
fig.add_trace(node_trace)
fig.update_layout(
template='plotly_white',
width=900,
height=600,
title='Collaboration Network, 2013-2023',
titlefont=dict(family="Verdana", size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20, l=5, r=5, t=40),
annotations=[dict(
text=f"Number of organizations: {len(G.nodes())}. Network Density: {network_density:.2%}",
showarrow=False,
xref="paper", yref="paper",
x=0.005, y=-0.002,
font=dict(family="Verdana")
)],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
font=dict(family="Verdana")
)
fig.show()


def process_data(collaboration_strength):
    """Split the collaboration records into one weighted graph per year.

    Converts the 'year' column to datetimes IN PLACE, then returns a tuple
    ``(graphs_by_year, all_nodes, years)`` where ``graphs_by_year`` maps each
    year to an ``nx.Graph`` whose edges carry the collaboration strength.
    """
    collaboration_strength['year'] = pd.to_datetime(
        collaboration_strength['year'], format='%Y')
    years = sorted(collaboration_strength['year'].unique())
    graphs_by_year = {}
    all_nodes = set()
    for year in years:
        subset = collaboration_strength[collaboration_strength['year'] == year]
        graph = nx.Graph()
        for _, record in subset.iterrows():
            graph.add_edge(record['org1'], record['org2'],
                           weight=record['strength'])
            all_nodes.update((record['org1'], record['org2']))
        graphs_by_year[year] = graph
    return graphs_by_year, all_nodes, years
def plot_graph(G, pos, fig):
    """Append two scatter traces (edges, then nodes) for graph *G* to *fig*.

    *pos* maps node -> (x, y); node colour encodes the connection count.
    Returns the figure so calls can be chained.
    """
    # Each edge contributes its two endpoints plus a None separator so that
    # plotly renders all segments as one disconnected line trace.
    seg_x, seg_y = [], []
    for u, v in G.edges():
        ux, uy = pos[u]
        vx, vy = pos[v]
        seg_x += [ux, vx, None]
        seg_y += [uy, vy, None]
    edge_trace = go.Scatter(
        x=seg_x, y=seg_y,
        line=dict(color='#0E86D4', width=1),
        hoverinfo='none',
        mode='lines'
    )
    # Node coordinates in graph iteration order.
    node_xs = [pos[n][0] for n in G.nodes()]
    node_ys = [pos[n][1] for n in G.nodes()]
    # Degree per node drives both the marker colour and the hover label.
    degrees = [len(list(G.adj[n])) for n in G.nodes()]
    labels = [f'{n}<br>Connections: {d}'
              for n, d in zip(G.nodes(), degrees)]
    node_trace = go.Scatter(
        x=node_xs, y=node_ys,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='Blues',
            reversescale=False,
            color=degrees,
            size=10,
            colorbar=dict(thickness=15, title='Connections',
                          xanchor='left', titleside='right'),
            line_width=2
        ),
        text=labels
    )
    fig.add_trace(edge_trace)
    fig.add_trace(node_trace)
    return fig
def create_visualization(collaboration_strength):
    """Build an interactive figure with one collaboration network per year,
    selectable through a slider (one slider step per year)."""
    graphs_by_year, all_nodes, years = process_data(collaboration_strength)
    # Create a combined graph for consistent node positioning
    G_combined = nx.Graph()
    for G in graphs_by_year.values():
        G_combined = nx.compose(G_combined, G)
    # One shared spring layout so each node keeps the same coordinates in
    # every year's view.
    pos = nx.spring_layout(G_combined, k=0.6, iterations=50)
    fig = go.Figure()
    # plot_graph appends exactly two traces per year (edges, then nodes);
    # the slider logic below depends on that 2-traces-per-year layout.
    for year in years:
        fig = plot_graph(G=graphs_by_year[year], pos=pos, fig=fig)
    # Set visibility for traces
    # NOTE(review): this `visibility` list is dead code — it is rebuilt from
    # scratch inside the slider loop below and never read before then.
    visibility = [False] * len(fig.data)
    visibility[0] = True
    # Initially show only the first year's edge and node traces.
    fig.update_traces(visible=False)
    fig.data[0].visible = True
    fig.data[1].visible = True
    steps = []
    for i, year in enumerate(years):
        # NOTE(review): assumes each `year` supports strftime (pandas
        # Timestamp); Series.unique() can yield numpy datetime64 — confirm.
        year_str = year.strftime('%Y')
        # Traces i*2 and i*2+1 belong to year i (edges + nodes).
        visibility = [False] * len(fig.data)
        start_idx = i * 2
        end_idx = (i + 1) * 2
        for j in range(start_idx, end_idx):
            visibility[j] = True
        network_density = nx.density(graphs_by_year[year])
        step = dict(
            method="update",
            args=[
                {"visible": visibility},
                # Replace the footer annotation with this year's stats.
                {"annotations": [dict(
                    text=f"Number of organizations: {len(graphs_by_year[year].nodes())}<br>Network Density: {network_density:.2%}",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002,
                    font=dict(family="Verdana")
                )]}
            ],
            label=f"{year_str}"
        )
        steps.append(step)
    # Footer annotation for the initially visible (first) year.
    initial_network_density = nx.density(graphs_by_year[years[0]])
    fig.update_layout(
        template='plotly_white',
        width=900,
        height=600,
        title='Collaboration Network by Year, 2013-2023',
        titlefont=dict(family="Verdana", size=16),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[dict(
            text=f"Number of organizations: {len(graphs_by_year[years[0]].nodes())}<br>Network Density: {initial_network_density:.2%}",
            showarrow=False,
            xref="paper", yref="paper",
            x=0.005, y=-0.002,
            font=dict(family="Verdana")
        )],
        # Hide both axes entirely — only the network itself is shown.
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        font=dict(family="Verdana"),
        sliders=[{
            "active": 0,
            "currentvalue": {"visible": False},
            "pad": {"t": 50},
            "steps": steps
        }],
    )
    return fig
fig = create_visualization(collaboration_strength)
fig.show()Fund Projects
fund_projects = pd.read_csv('data/fund_projects_flat.csv')
fund_projects.shape(1742, 4)
# filter out 2024
fund_projects = fund_projects[fund_projects['year'] < 2024]
fund_projects.shape(1682, 4)
fund_projects.fund_project.nunique()868
gp = fund_projects.groupby('year')['fund_project'].count().reset_index()
gp = gp[gp['fund_project'] > 0]
fig = px.bar(gp,
x='year',
y='fund_project'
)
fig.update_layout(
layout,
title='Number of Fund Projects by Year',
xaxis=dict(
title="",
type='category',
),
yaxis=dict(title="Fund Projects"),
margin=dict(r=50, l=50, b=0, t=50)
)
fig.update_traces(
textposition='outside',
texttemplate='%{y}',
textfont=dict(color='black', size=14, family='Verdana'),
marker_color='#0E86D4',
opacity=0.7,
hovertemplate='Year: %{x}<br>Fund Projects: %{y}<extra></extra>'
)
fig.show()gp = df[df['year'] < 2024].groupby('year')['title_cn'].count().reset_index()
fund_count = df.groupby('year')['fund_project'].count().reset_index()
gp = gp.merge(fund_count, on='year', how='left')
gp = gp[gp['fund_project'] > 0]
gp['fund_per_article'] = gp['fund_project'] / gp['title_cn']
fig = px.bar(gp,
x='year',
y='fund_per_article',
)
fig.update_layout(
layout,
#title='Number of Fund Projects by Year',
xaxis=dict(
title="",
type='category',
),
yaxis=dict(title="Fund Projects"),
)
fig.update_traces(
textposition='outside',
texttemplate='%{y:.0%}',
textfont=dict(color='black', size=14, family='Verdana'),
marker_color='#0E86D4',
opacity=0.7,
hovertemplate='Year: %{x}<br>Fund Projects per Article: %{y:.2f}<extra></extra>'
)
fig.show()Keywords
Article Description Keywords
keywords = df[['year', 'title_cn', 'keywords_cn']].copy()
keywords = keywords.dropna(subset=['keywords_cn'])keywords.head()| year | title_cn | keywords_cn | |
|---|---|---|---|
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 |
| 170 | 2023 | 我国煤炭主体能源安全高质量发展的理论技术思考 | 能源安全,煤炭智能精准开采,清洁高效利用,碳中和科学发展 |
| 171 | 2023 | 新形势下我国能源高质量发展与能源安全 | 能源安全,高质量发展,综合能源保障体系,全方位安全观,能源与矿业治理 |
| 172 | 2023 | 页岩油开发利用及在能源中的作用 | 页岩油,能源安全,开发利用,能源体系,政策建议,中国 |
| 173 | 2023 | 碳中和目标下中国新能源使命 | 碳达峰,碳中和,碳中和学,新能源,能源转型,能源独立,碳中和社会 |
keywords.shape(4962, 3)
import regex as re
# replace punctuations with comas
keywords.keywords_cn = (keywords.keywords_cn.str.replace('[,,;;()!!~]', ',', regex=True)
.str.replace(r',+', ',', regex=True))
# flatten the keywords list
keywords['keywords_list'] = keywords.keywords_cn.str.split(',')
keywords_flat = keywords.explode('keywords_list').rename(
columns={'keywords_list': 'keyword'})
keywords_flat.keyword = (keywords_flat.keyword.str.rstrip(' ')
.str.lstrip(' '))keywords_flat.head()| year | title_cn | keywords_cn | keyword | |
|---|---|---|---|---|
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | 碳达峰 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | 碳中和 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | 油气安全 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | 关系 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | 路径 |
keywords_stats = keywords_flat['keyword'].value_counts(
).reset_index().sort_values(ascending=False, by='count')
def get_keywords_stats(keywords_stats):
    """Annotate *keywords_stats* IN PLACE with percentage columns.

    Adds 'share' (each keyword's percentage of all occurrences) and
    'cumulative_share' (running total of 'share' in row order). Returns None.
    """
    total = keywords_stats['count'].sum()
    keywords_stats['share'] = 100. * keywords_stats['count'] / total
    keywords_stats['cumulative_share'] = (
        100. * keywords_stats['count'].cumsum() / total)
get_keywords_stats(keywords_stats)keywords_stats.head(25)| keyword | count | share | cumulative_share | |
|---|---|---|---|---|
| 0 | 中国科学院 | 820 | 3.060387 | 3.060387 |
| 1 | 基础研究 | 107 | 0.399343 | 3.459730 |
| 2 | 可持续发展 | 103 | 0.384414 | 3.844144 |
| 3 | 科学家 | 83 | 0.309771 | 4.153915 |
| 4 | 研究所 | 83 | 0.309771 | 4.463686 |
| 5 | 中国科学院院士 | 81 | 0.302306 | 4.765992 |
| 6 | 建议 | 76 | 0.283646 | 5.049638 |
| 7 | 学部委员 | 75 | 0.279913 | 5.329551 |
| 8 | 国际合作 | 72 | 0.268717 | 5.598268 |
| 9 | 中国 | 70 | 0.261253 | 5.859521 |
| 11 | 科技创新 | 59 | 0.220199 | 6.079719 |
| 10 | 中国科学院学部 | 59 | 0.220199 | 6.299918 |
| 12 | 知识创新工程 | 53 | 0.197805 | 6.497723 |
| 13 | 科学技术 | 53 | 0.197805 | 6.695529 |
| 14 | 气候变化 | 51 | 0.190341 | 6.885870 |
| 15 | 院士 | 49 | 0.182877 | 7.068747 |
| 16 | 国家重点实验室 | 49 | 0.182877 | 7.251623 |
| 17 | 中科院 | 48 | 0.179145 | 7.430768 |
| 18 | 创新 | 47 | 0.175412 | 7.606180 |
| 19 | 对策 | 44 | 0.164216 | 7.770396 |
| 21 | 研究成果 | 40 | 0.149287 | 7.919684 |
| 20 | 发展 | 40 | 0.149287 | 8.068971 |
| 22 | 青藏高原 | 37 | 0.138091 | 8.207061 |
| 27 | 碳中和 | 36 | 0.134358 | 8.341420 |
| 29 | 改革 | 36 | 0.134358 | 8.475778 |
keywords_stats['count'].describe()count 14530.000000
mean 1.844047
std 7.613227
min 1.000000
25% 1.000000
50% 1.000000
75% 1.000000
max 820.000000
Name: count, dtype: float64
fig = px.bar(keywords_stats.head(20), x='count', y='keyword', orientation='h')
fig.update_layout(
layout,
title='Top 20 Keywords between 2013-2023',
width=700,
height=600,
xaxis=dict(range=[0, keywords_stats['count'].max() * 1.1]),
yaxis=dict(
title="Word Count",
autorange='reversed'
),
margin=dict(r=50, l=50, b=0, t=50)
)
fig.update_traces(
textposition='outside',
texttemplate='%{x}',
textfont=dict(color='black'),
marker_color='#0E86D4',
opacity=0.7,
hovertemplate='Keyword: %{y}<br>Count: %{x}<extra></extra>'
)
fig.show()NameError: name 'px' is not defined
keywords_top_year = keywords_flat.groupby(
'year')['keyword'].value_counts().reset_index()
keywords_top_year| year | keyword | count | |
|---|---|---|---|
| 0 | 1986 | 中国科学院 | 26 |
| 1 | 1986 | 研究所 | 7 |
| 2 | 1986 | 基础研究 | 5 |
| 3 | 1986 | 平装 | 4 |
| 4 | 1986 | 新书 | 4 |
| ... | ... | ... | ... |
| 21843 | 2023 | 高原湖泊 | 1 |
| 21844 | 2023 | 高影响专利 | 1 |
| 21845 | 2023 | 高被引论文 | 1 |
| 21846 | 2023 | 高质量 | 1 |
| 21847 | 2023 | 黄土高原 | 1 |
21848 rows × 3 columns
from plotly.subplots import make_subplots
years = list(range(2013, 2024))
fig = make_subplots(
rows=6,
cols=2,
subplot_titles=[f'Top 10 Keywords in {year}' for year in years],
vertical_spacing=0.05,
horizontal_spacing=0.21
)
for i, year in enumerate(years):
gp = (keywords_flat[keywords_flat['year'] == year].keyword.value_counts(
normalize=True) * 100).reset_index().head(10)
fig.add_trace(
go.Bar(
x=gp['proportion'],
y=gp['keyword'],
orientation='h',
marker_color='#0E86D4',
opacity=0.7,
text=gp['proportion'].apply(lambda x: f'{x:.2f}%'),
textposition='outside',
textfont=dict(color='black'),
hovertemplate='Keyword: %{y}<br>Share: %{x:.2f}%<extra></extra>'
),
row=(i // 2) + 1, col=(i % 2) + 1 # Calculate row and column
)
fig.update_xaxes(title_text="Share", range=[
0, 4.5], showline=True, linecolor='black')
fig.update_yaxes(autorange="reversed", ticklabelposition="outside", row=(
i // 2) + 1, col=(i % 2) + 1, showline=True, linecolor='black')
fig.update_layout(
layout,
height=2000,
width=800,
showlegend=False,
)
fig.show()Jieba Tokenization
import jieba

# Compiled once instead of on every call: matches anything that is not a
# CJK unified ideograph (U+4E00..U+9FA5).
_NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')


# clean and preprocess Chinese with jieba
def tokenize(text):
    """Strip non-Chinese characters from *text*, then return the jieba
    segmentation as a single space-joined string."""
    text = _NON_CHINESE.sub('', text)
    # cut_all=False = accurate mode (no overlapping segmentations)
    return ' '.join(jieba.cut(text, cut_all=False))


keywords['keywords_tokenized'] = keywords.keywords_cn.apply(tokenize)
Dumping model to file cache /var/folders/fm/r7lztspd5m77mjytcdb5636w0000gn/T/jieba.cache
Loading model cost 0.442 seconds.
Prefix dict has been built successfully.
keywords[['keywords_cn', 'keywords_tokenized']].head()| keywords_cn | keywords_tokenized | |
|---|---|---|
| 169 | 碳达峰,碳中和,油气安全,关系,路径,战略 | 碳达峰 碳 中 和 油气 安全 关系 路径 战略 |
| 170 | 能源安全,煤炭智能精准开采,清洁高效利用,碳中和科学发展 | 能源安全 煤炭 智能 精准 开采 清洁 高效 利用 碳中 和 科学 发展 |
| 171 | 能源安全,高质量发展,综合能源保障体系,全方位安全观,能源与矿业治理 | 能源安全 高质量 发展 综合 能源 保障体系 全方位 安全观 能源 与 矿业 治理 |
| 172 | 页岩油,能源安全,开发利用,能源体系,政策建议,中国 | 页岩 油 能源安全 开发利用 能源 体系 政策 建议 中国 |
| 173 | 碳达峰,碳中和,碳中和学,新能源,能源转型,能源独立,碳中和社会 | 碳达峰 碳 中 和 碳 中和学 新能源 能源 转型 能源 独立 碳中 和 社会 |
from sklearn.feature_extraction.text import CountVectorizer
# create a CountVectorizer object
vectorizer = CountVectorizer()
# fit and transform the data
X = vectorizer.fit_transform(keywords['keywords_tokenized'])
# Get the word frequencies
jieba_word_counts = pd.DataFrame(
X.sum(axis=0), columns=vectorizer.get_feature_names_out()).Tjieba_word_counts = jieba_word_counts.sort_values(
ascending=False, by=0).reset_index()
jieba_word_counts = jieba_word_counts.rename(
columns={'index': 'keyword', 0: 'count'})def get_keywords_stats(jieba_word_counts):
jieba_word_counts['share'] = jieba_word_counts['count'] / \
jieba_word_counts['count'].sum() * 100
jieba_word_counts['cumulative_share'] = jieba_word_counts['count'].cumsum(
)/jieba_word_counts['count'].sum() * 100
return jieba_word_counts[['keyword', 'count', 'share', 'cumulative_share']]
jieba_word_counts_stats = get_keywords_stats(jieba_word_counts)jieba_word_counts_stats.head(25)| keyword | count | share | cumulative_share | |
|---|---|---|---|---|
| 0 | 中国科学院 | 991 | 2.219833 | 2.219833 |
| 1 | 研究 | 823 | 1.843514 | 4.063347 |
| 2 | 发展 | 728 | 1.630715 | 5.694062 |
| 3 | 科技 | 647 | 1.449275 | 7.143337 |
| 4 | 创新 | 535 | 1.198396 | 8.341733 |
| 5 | 科学 | 534 | 1.196156 | 9.537889 |
| 6 | 技术 | 417 | 0.934077 | 10.471966 |
| 7 | 研究所 | 342 | 0.766078 | 11.238044 |
| 8 | 国家 | 314 | 0.703358 | 11.941402 |
| 9 | 中国 | 277 | 0.620478 | 12.561880 |
| 10 | 战略 | 271 | 0.607038 | 13.168918 |
| 11 | 生态 | 265 | 0.593598 | 13.762516 |
| 12 | 合作 | 262 | 0.586878 | 14.349394 |
| 13 | 国际 | 256 | 0.573438 | 14.922832 |
| 14 | 工程 | 256 | 0.573438 | 15.496270 |
| 15 | 院士 | 232 | 0.519678 | 16.015949 |
| 16 | 生物 | 230 | 0.515198 | 16.531147 |
| 17 | 实验室 | 222 | 0.497278 | 17.028426 |
| 18 | 工作 | 217 | 0.486078 | 17.514504 |
| 19 | 基础 | 213 | 0.477118 | 17.991622 |
| 20 | 持续 | 212 | 0.474878 | 18.466501 |
| 21 | 经济 | 200 | 0.447999 | 18.914499 |
| 22 | 科学技术 | 185 | 0.414399 | 19.328898 |
| 23 | 物理 | 184 | 0.412159 | 19.741057 |
| 24 | 社会 | 181 | 0.405439 | 20.146496 |
fig = px.bar(
jieba_word_counts_stats.head(20),
x='count',
y='keyword',
orientation='h'
)
fig.update_layout(
layout,
title='Top 20 Tokens between 2013-2023',
width=700,
height=600,
xaxis=dict(
range=[0, jieba_word_counts_stats['count'].max() * 1.1],
title="Token Count"),
yaxis=dict(
autorange='reversed'
),
margin=dict(r=50, l=50, b=0, t=50)
)
fig.update_traces(
textposition='outside',
texttemplate='%{x}',
textfont=dict(color='black'),
marker_color='#0E86D4',
opacity=0.7,
hovertemplate='Token: %{y}<br>Count: %{x}<extra></extra>'
)
fig.show()keywords.keywords_tokenized = keywords.keywords_tokenized.apply(
lambda x: x.split())keywords.head()| year | title_cn | keywords_cn | keywords_list | keywords_tokenized | |
|---|---|---|---|---|---|
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] | [碳达峰, 碳, 中, 和, 油气, 安全, 关系, 路径, 战略] |
| 170 | 2023 | 我国煤炭主体能源安全高质量发展的理论技术思考 | 能源安全,煤炭智能精准开采,清洁高效利用,碳中和科学发展 | [能源安全, 煤炭智能精准开采, 清洁高效利用, 碳中和科学发展] | [能源安全, 煤炭, 智能, 精准, 开采, 清洁, 高效, 利用, 碳中, 和, 科学, 发展] |
| 171 | 2023 | 新形势下我国能源高质量发展与能源安全 | 能源安全,高质量发展,综合能源保障体系,全方位安全观,能源与矿业治理 | [能源安全, 高质量发展, 综合能源保障体系, 全方位安全观, 能源与矿业治理] | [能源安全, 高质量, 发展, 综合, 能源, 保障体系, 全方位, 安全观, 能源, 与, 矿业, 治理] |
| 172 | 2023 | 页岩油开发利用及在能源中的作用 | 页岩油,能源安全,开发利用,能源体系,政策建议,中国 | [页岩油, 能源安全, 开发利用, 能源体系, 政策建议, 中国] | [页岩, 油, 能源安全, 开发利用, 能源, 体系, 政策, 建议, 中国] |
| 173 | 2023 | 碳中和目标下中国新能源使命 | 碳达峰,碳中和,碳中和学,新能源,能源转型,能源独立,碳中和社会 | [碳达峰, 碳中和, 碳中和学, 新能源, 能源转型, 能源独立, 碳中和社会] | [碳达峰, 碳, 中, 和, 碳, 中和学, 新能源, 能源, 转型, 能源, 独立, 碳中, 和, 社会] |
keywords_jieba_flat = keywords.explode('keywords_tokenized').rename(
columns={'keywords_tokenized': 'keyword_jieba'})keywords_jieba_flat.head()| year | title_cn | keywords_cn | keywords_list | keyword_jieba | |
|---|---|---|---|---|---|
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] | 碳达峰 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] | 碳 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] | 中 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] | 和 |
| 169 | 2023 | 油气安全战略与“双碳”战略:关系与路径 | 碳达峰,碳中和,油气安全,关系,路径,战略 | [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] | 油气 |
from plotly.subplots import make_subplots
years = list(range(2013, 2024))
fig = make_subplots(
rows=6,
cols=2,
subplot_titles=[f'Top 10 Tokens in {year}' for year in years],
vertical_spacing=0.05,
horizontal_spacing=0.2
)
for i, year in enumerate(years):
gp = (keywords_jieba_flat[keywords_jieba_flat['year'] == year].keyword_jieba.value_counts(
normalize=True)*100).reset_index().head(10)
fig.add_trace(
go.Bar(
x=gp['proportion'],
y=gp['keyword_jieba'],
orientation='h',
marker_color='#0E86D4',
opacity=0.7,
text=gp['proportion'].apply(lambda x: f'{x:.2f}%'),
textposition='outside',
textfont=dict(color='black'),
hovertemplate='Keyword: %{y}<br>Share: %{x:.2f}%<extra></extra>'
),
row=(i // 2) + 1, col=(i % 2) + 1 # Calculate row and column
)
fig.update_xaxes(title_text="Share", range=[
0, 4.5], showline=True, linecolor='black')
fig.update_yaxes(autorange="reversed", ticklabelposition="outside", row=(
i // 2) + 1, col=(i % 2) + 1, showline=True, linecolor='black')
fig.update_layout(
layout,
height=2000,
width=800,
showlegend=False,
)
fig.show()

# get the index of the keyword where cumulative_share >= 80%
# Vectorised replacement for the original iterrows scan, which left
# `keyword_index` undefined (NameError) if the threshold was never reached.
reached = jieba_word_counts_stats['cumulative_share'] >= 80
if reached.any():
    # idxmax on a boolean Series returns the first True label; +1 converts
    # the 0-based position into a 1-based token count.
    keyword_index = int(reached.idxmax()) + 1
else:
    keyword_index = len(jieba_word_counts_stats)
# share of distinct tokens needed to cover 80% of all token occurrences
top_words_share = keyword_index / jieba_word_counts_stats.keyword.nunique() * \
    100
print(
    f'Share of Words that make up 80% of words: {round(top_words_share, 2)}%')
jieba_word_counts_stats.head()| keyword | count | share | cumulative_share | |
|---|---|---|---|---|
| 0 | 中国科学院 | 991 | 2.219833 | 2.219833 |
| 1 | 研究 | 823 | 1.843514 | 4.063347 |
| 2 | 发展 | 728 | 1.630715 | 5.694062 |
| 3 | 科技 | 647 | 1.449275 | 7.143337 |
| 4 | 创新 | 535 | 1.198396 | 8.341733 |
fig = px.line(jieba_word_counts_stats, x=jieba_word_counts_stats.keyword.index,
y='cumulative_share', markers=True)
fig.update_layout(
layout,
yaxis=dict(title="Cumulative Share"),
xaxis=dict(title="Word Index", minor=dict(showgrid=True),),
title=f'Cumulative Distribution of Tokens',
margin=dict(r=50, l=50, b=0, t=50)
)
fig.update_traces(
marker_color='#0E86D4',
opacity=0.7
)
fig.show()fig = px.scatter(jieba_word_counts_stats, x=jieba_word_counts_stats.keyword.index, y="count")
fig.update_layout(
layout,
yaxis=dict(title="Token Frequency"),
xaxis=dict(title="Token Index", minor=dict(showgrid=True),),
)
fig.update_traces(
marker_color='#0E86D4',
opacity=0.7,
hovertemplate='Frequency: %{y}<br>Index: %{x}<extra></extra>'
)
fig.show()Wordcloud
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Sample DataFrame
font_path = '/Library/Fonts/STHeiti Light.ttc'
#mask_image = np.array(Image.open('/Users/dmitrijmazanik/Downloads/China-Map-PNG-Pic.png'))
# Flatten the list of tokenized words into a single string
all_words = ' '.join(keywords['keywords_tokenized'].sum())
# Create the word cloud
wc = WordCloud(
#mask=mask_image,
font_path=font_path,
max_words=1500,
max_font_size=100,
random_state=42,
width=800, height=1000,
contour_width=1,
background_color="white", # Set to None to utilize mask colors
colormap='PuBu', # You can choose a colormap that fits your design
).generate(all_words)
#image_colors_default = ImageColorGenerator(mask_image)
# Display the word cloud
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis('off') # Turn off axis
plt.show()